#!/usr/bin/env python
# coding:utf-8
import urllib2
import re
import os


class Spider(object):
    """Scraper that downloads listing pages and stores each post as a text file.

    The listing URL is a template with one ``%s`` placeholder for the page
    number.  Every post found on a page is written to ``<article id>.txt``.
    """

    # Regex applied to a listing page.  Each match yields three groups:
    # (article id, optional anchor attributes, post body HTML).
    # Previous pattern, kept for reference:
    # '<div class="content">(.*?)<!--(.*?)-->.*?</div>'
    _ITEM_PATTERN = re.compile(
        '<a href="/article/(.*?)".*?(target="_blank" class=\'contentHerf\')? >.*?<div class="content">.*?<span>(.*?)</span>.*?</div>.*?</a>',
        re.S,
    )

    def __init__(self):
        """Set the listing URL template and the User-Agent sent with requests."""
        self.url = "http://www.qiushibaike.com/8hr/page/%s/?s=4966067"
        # Alternate URL template (disabled):
        # self.url = "http://www.bossidc.com/qiubai/%s.htm"
        self.user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36"

    def get_page(self, pageIndex):
        """Download one listing page and return its raw body.

        :param pageIndex: page number substituted into ``self.url``.
        :returns: the response body as returned by ``response.read()``.
        :raises SystemExit: after printing a message, when the request fails
            with ``urllib2.HTTPError`` or ``urllib2.URLError``.
        """
        headers = {"User-Agent": self.user_agent}
        request = urllib2.Request(url=self.url % str(pageIndex), headers=headers)
        try:
            response = urllib2.urlopen(request)
            try:
                return response.read()
            finally:
                # Release the connection even if reading the body fails.
                response.close()
        except urllib2.HTTPError as e:
            # HTTPError is a subclass of URLError, so it must be handled first.
            print(e)
            # Same effect as the interactive ``exit()`` helper, without
            # depending on the ``site`` module having injected it.
            raise SystemExit()
        except urllib2.URLError:
            print("网络无法访问...")
            raise SystemExit()

    def analysis(self, content):
        """Extract posts from the HTML of a listing page.

        :param content: page source as returned by :meth:`get_page`.
        :returns: list of 3-tuples ``(article_id, anchor_attrs, body_html)``,
            one per match of the item pattern; empty list when nothing matches.
        """
        return self._ITEM_PATTERN.findall(content)

    def save(self, items, path):
        """Write every post in ``items`` to ``<path>/<article_id>.txt``.

        Newlines in the post body are dropped and ``<br/>`` tags become
        newlines.  The directory is created if it does not exist yet; nothing
        is created when ``items`` is empty.

        :param items: sequence of tuples as produced by :meth:`analysis`
            (index 0 is the article id, index 2 the body HTML).
        :param path: directory the text files are written into.
        """
        if not items:
            return
        if not os.path.isdir(path):
            os.makedirs(path)
        for item in items:
            text = item[2].replace("\n", '').replace('<br/>', '\n')
            # The id comes from scraped HTML: keep only its last path
            # component so a crafted id cannot escape ``path``.
            article_id = os.path.basename(item[0])
            if not article_id:
                continue
            file_name = article_id + ".txt"
            with open(os.path.join(path, file_name), 'w') as f:
                f.write(text)
            print("写入" + file_name)

    def run(self):
        """Fetch listing pages 1 to 3 and save their posts under ``qiubai``."""
        print("开始抓取...")
        for i in range(1, 4):
            content = self.get_page(i)
            items = self.analysis(content)
            self.save(items, "qiubai")
        print("结束抓取...")


if __name__ == "__main__":
    # Script entry point: build a spider and run one full crawl.
    Spider().run()


# print  "开始抓取..."
# for i in range(1, 4):
#     url = "http://www.bossidc.com/qiubai/" + str(i) + ".htm"
#     headers = {
#         "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36"}
#     try:
#         request = urllib2.Request(url=url, headers=headers)
#         response = urllib2.urlopen(request)
#         content = response.read()
#     except urllib2.HTTPError as e:
#         print  e
#         exit()
#     except urllib2.URLError as e:
#         print "网络无法访问..."
#         exit()
#     pattern = re.compile('<div class="content">(.*?)<!--(.*?)-->.*?</div>', re.S)
#     items = re.findall(pattern, content)
#     for item in items:
#         # print item[0]
#         # print item[1]
#         newitem = item[0].replace("\n", '').replace('<br/>', '\n')
#
#         path = "qiubai"
#         if not os.path.exists(path):
#             os.mkdir(path)
#         file_path = path + "/" + item[1] + ".txt"
#         f = open(file_path, 'w')
#         f.write(newitem)
#         f.close()
#         print "写入" + item[1] + ".txt"
#
# print  "结束抓取..."
